Dictionary Usage

Dictionary Usage as a Lookup Table

sine = [-0.0000, -0.0495, -0.0988, -0.1479,
        -0.1966, -0.2449, -0.2925, -0.3394,
        -0.3855, -0.4307, -0.4748, -0.5177,
        -0.5594, -0.5997, -0.6386, -0.6758,
        -0.7115, -0.7453, -0.7774, -0.8076,
        -0.8357, -0.8619, -0.8859, -0.9078,
        -0.9274, -0.9448, -0.9598, -0.9725,
        -0.9828, -0.9908, -0.9963, -0.9993,
        -0.9999, -0.9981, -0.9938, -0.9871,
        -0.9780, -0.9665, -0.9526, -0.9364,
        -0.9179, -0.8971, -0.8742, -0.8491,
        -0.8219, -0.7927, -0.7616, -0.7286,
        -0.6939, -0.6574, -0.6193, -0.5798,
        -0.5387, -0.4964, -0.4529, -0.4082,
        -0.3626, -0.3161, -0.2688, -0.2208,
        -0.1723, -0.1234, -0.0741, -0.0247,
         0.0247,  0.0741,  0.1234,  0.1723,
         0.2208,  0.2688,  0.3161,  0.3626,
         0.4082,  0.4529,  0.4964,  0.5387,
         0.5798,  0.6193,  0.6574,  0.6939,
         0.7286,  0.7616,  0.7927,  0.8219,
         0.8491,  0.8742,  0.8971,  0.9179,
         0.9364,  0.9526,  0.9665,  0.9780,
         0.9871,  0.9938,  0.9981,  0.9999,
         0.9993,  0.9963,  0.9908,  0.9828,
         0.9725,  0.9598,  0.9448,  0.9274,
         0.9078,  0.8859,  0.8619,  0.8357,
         0.8076,  0.7774,  0.7453,  0.7115,
         0.6758,  0.6386,  0.5997,  0.5594,
         0.5177,  0.4748,  0.4307,  0.3855,
         0.3394,  0.2925,  0.2449,  0.1966,
         0.1479,  0.0988,  0.0495,  0.0000,]
from math import pi

angle = 0                               # offset in the table
angle_index = int((pi + angle) / (2 * pi) * (len(sine) - 1))
print(angle_index)                      # 63

sine[angle_index]                       # -0.0247 - bad accuracy, should be 0.0

from math import sin
sin(angle)                              # 0.0

sine[angle_index-1], sine[angle_index], sine[angle_index+1]  # (-0.0741, -0.0247, 0.0247)

# Create a table of sine values
table = {round(2*pi*i/(len(sine) -1) - pi, 4): sine[i] for i in range(len(sine))}
table = {-3.1416: -0.0000, -3.0921: -0.0495, -3.0426: -0.0988, -2.9932: -0.1479,
         -2.9437: -0.1966, -2.8942: -0.2449, -2.8447: -0.2925, -2.7953: -0.3394,
         -2.7458: -0.3855, -2.6963: -0.4307, -2.6469: -0.4748, -2.5974: -0.5177,
         -2.5479: -0.5594, -2.4984: -0.5997, -2.4490: -0.6386, -2.3995: -0.6758,
         -2.3500: -0.7115, -2.3005: -0.7453, -2.2511: -0.7774, -2.2016: -0.8076,
         -2.1521: -0.8357, -2.1026: -0.8619, -2.0532: -0.8859, -2.0037: -0.9078,
         -1.9542: -0.9274, -1.9047: -0.9448, -1.8553: -0.9598, -1.8058: -0.9725,
         -1.7563: -0.9828, -1.7068: -0.9908, -1.6574: -0.9963, -1.6079: -0.9993,
         -1.5584: -0.9999, -1.5090: -0.9981, -1.4595: -0.9938, -1.4100: -0.9871,
         -1.3605: -0.9780, -1.3111: -0.9665, -1.2616: -0.9526, -1.2121: -0.9364,
         -1.1626: -0.9179, -1.1132: -0.8971, -1.0637: -0.8742, -1.0142: -0.8491,
         -0.9647: -0.8219, -0.9153: -0.7927, -0.8658: -0.7616, -0.8163: -0.7286,
         -0.7668: -0.6939, -0.7174: -0.6574, -0.6679: -0.6193, -0.6184: -0.5798,
         -0.5689: -0.5387, -0.5195: -0.4964, -0.4700: -0.4529, -0.4205: -0.4082,
         -0.3711: -0.3626, -0.3216: -0.3161, -0.2721: -0.2688, -0.2226: -0.2208,
         -0.1732: -0.1723, -0.1237: -0.1234, -0.0742: -0.0741, -0.0247: -0.0247,
          0.0247:  0.0247,  0.0742:  0.0741,  0.1237:  0.1234,  0.1732:  0.1723,
          0.2226:  0.2208,  0.2721:  0.2688,  0.3216:  0.3161,  0.3711:  0.3626,
          0.4205:  0.4082,  0.4700:  0.4529,  0.5195:  0.4964,  0.5689:  0.5387,
          0.6184:  0.5798,  0.6679:  0.6193,  0.7174:  0.6574,  0.7668:  0.6939,
          0.8163:  0.7286,  0.8658:  0.7616,  0.9153:  0.7927,  0.9647:  0.8219,
          1.0142:  0.8491,  1.0637:  0.8742,  1.1132:  0.8971,  1.1626:  0.9179,
          1.2121:  0.9364,  1.2616:  0.9526,  1.3111:  0.9665,  1.3605:  0.9780,
          1.4100:  0.9871,  1.4595:  0.9938,  1.5090:  0.9981,  1.5584:  0.9999,
          1.6079:  0.9993,  1.6574:  0.9963,  1.7068:  0.9908,  1.7563:  0.9828,
          1.8058:  0.9725,  1.8553:  0.9598,  1.9047:  0.9448,  1.9542:  0.9274,
          2.0037:  0.9078,  2.0532:  0.8859,  2.1026:  0.8619,  2.1521:  0.8357,
          2.2016:  0.8076,  2.2511:  0.7774,  2.3005:  0.7453,  2.3500:  0.7115,
          2.3995:  0.6758,  2.4490:  0.6386,  2.4984:  0.5997,  2.5479:  0.5594,
          2.5974:  0.5177,  2.6469:  0.4748,  2.6963:  0.4307,  2.7458:  0.3855,
          2.7953:  0.3394,  2.8447:  0.2925,  2.8942:  0.2449,  2.9437:  0.1966,
          2.9932:  0.1479,  3.0426:  0.0988,  3.0921:  0.0495,  3.1416:  0.0000}
Use interpolation for values missed in the table
from bisect import bisect
# from __future__ import division

class interpolating_dict(dict):
    def __missing__(self, key):
        sorted_keys = sorted(self.keys())

        index = bisect(sorted_keys, key)

        if index == 0 or index == len(sorted_keys):
            raise KeyError('cannot extrapolate value {}'.format(key))

        left_key, right_key = sorted_keys[index-1], sorted_keys[index]
        left_val, right_val = self[left_key], self[right_key]

        slope = (right_val - left_val) / (right_key - left_key)
        self[key] = value = slope * (key - left_key) + left_val
        return value

# New sine object
sine = interpolating_dict(table)
print(sine[-3.1416])      # -0.0 - OK
print(sine[0])            # 0.0  - new interpolated value, saved in the table
print(sine[pi/2])         # 0.9997497414933952 - interpolated and saved in the table
print(sine[pi/4])         # 0.7069375004018476 - interpolated and saved in the table

print(sine)
# {-3.1416: -0.0,
#  -3.0921: -0.0495,
#  -3.0426: -0.0988,
#  . . .
#  3.0426: 0.0988,
#  3.0921: 0.0495,
#  3.1416: 0.0,
#  0: 0.0,
#  1.5707963267948966: 0.9997497414933952,
#  0.7853981633974483: 0.7069375004018476}

Dictionary usage as a Relation or a Function

from math import sqrt

def roots(a,b,c):
    return (-b + sqrt(b**2 - 4*a*c))/(2*a)

Test:

print(roots(1,2,1)) # (x+1)(x+1) == x? + 2x + 1    # -1.0
class rootdict(dict): # ???
    def __missing__(self, key):
        a, b, c = key
        return (-b + sqrt(b**2 - 4*a*c))/(2*a)

Test:

d = rootdict()
print(d[1,2,1])                               # -1.0
print(d[2,4,2])                               # -1.0
from sys import version_info
assert version_info.major == 3 and version_info.minor >= 3, \
    'requires PEP 362; Python 3.3 or later; python.org/dev/peps/pep-0362/'

from inspect import signature

class memoise(dict):

    def __init__(self, func):
        self.func, self.signature = func, signature(func)

    def __missing__(self, key):
        args, kwargs = key
        self[key] = self.func(*args, **dict(kwargs))
        return self[key]

    def __call__(self, *args, **kwargs):
        key = self.signature.bind(*args, **kwargs)
        return self[key.args, frozenset(key.kwargs.items())]

from time import sleep

@memoise                               # use decorator
def roots(a,b,c):
    sleep(1)
    return (-b + sqrt(b**2 - 4*a*c))/(2*a)

%time roots(1,2,1)                     # Wall time: 1 s; -1.0
%time roots(1,2,1)                     # Wall time: 0 ns; -1.0   <== 0 time for known result
roots(2,4,2)                           # -1.0
print(roots)                           # {((1, 2, 1), frozenset()): -1.0, ((2, 4, 2), frozenset()): -1.0}
                                       # it's a dictionary
                                       # for a given arguments the results are saved !!!
                                       # and we don't make computation again

Protocols: __getitem__ and __call__

d = {1: 1, 2: 4}
print(d[2])                            # 4
print(d.__getitem__(2))                # 4

class Foo:
    def __getitem__(self, key):
        return key**2
foo = Foo()
print(foo[13])                         # [ key ] -> __getitem__ => 169 value
class Foo:
    def __call__(self, val):
        return val**2
foo = Foo()
print(foo(10))                         # ( val ) -> __call__ => 100

dict.__missing__

from collections import Iterable

class rangedict(dict):
    def __missing__(self, key):
        for k, v in self.items():
            if isinstance(k, Iterable):
                left, right = k
                if left <= key < right:
                    self[key] = v
                    return v
        raise KeyError('cannot find {} in rangedict'.format(key))

codes = rangedict({(  0,   10): 'red',
                   ( 10,  100): 'yellow',
                   (100, 1000): 'green', })

print(codes[105])                      # 'green'
print(codes)                           # {(0, 10): 'red', (10, 100): 'yellow', (100, 1000): 'green', 105: 'green'}
class passthrudict(dict):
    def __missing__(self, key):
        return key

censor = passthrudict({'hell': 'h***',
                      'darn': 'd*rn',})
sentence = "That darn cat!"
print(' '.join(censor[w] for w in sentence.split())) # 'That d*rn cat!'

sentence = "Y'all can go to hell ; I'm going to Texas!"
print(' '.join(censor[w] for w in sentence.split())) # "Y'all can go to hell; I'm going to Texas!" - no split!!!

# Equivalent to .get() but syntax is easier.

# Fix split issue

from itertools import groupby, chain
from unicodedata import category

def fancy_split(s):
    ''' Take string, group by unicode category (L - letters) '''
    return [''.join(g) for k, g in groupby(s, key=lambda x: category(x)[0] == 'L')]

Test:

sentence = "Y'all can go to hell; I'm going to Texas!"
print(fancy_split(sentence))

Output:

['Y',
 "'",
 'all',
 ' ',
 'can',
 ' ',
 'go',
 ' ',
 'to',
 ' ',
'hell',
'; ',
'I',
 "'",
 'm',
 ' ',
 'going',
 ' ',
 'to',
 ' ',
 'Texas',
 '!']
print(''.join(censor[w] for w in fancy_split(sentence)))

Output:

"Y'all can go to h***; I'm going to Texas!"

collections.defaultdict

student_courses = {'vinnie':    {'calculus', 'diff eq'},
                   'arnold':    {'calculus', 'linear algebra'},
                   'juan luis': {'real analysis'}}

# student_courses['freddie'].add('linear algebra')     # Error

student_courses['freddie'] = set()
student_courses['freddie'].add('linear algebra')

Use helper called defaultdict

from collections import defaultdict

student_courses = defaultdict(set)

student_courses.update({'vinnie':    {'calculus', 'diff eq'},
                        'arnold':    {'calculus', 'linear algebra'},
                        'juan luis': {'real analysis'},
                       }
                      )

print(student_courses['freddy'])       # set()

student_courses['freddie'].add('linear algebra')   # create set and add to the set
print(student_courses)

Output:

defaultdict(set,
           {'vinnie': {'calculus', 'diff eq'},
            'arnold': {'calculus', 'linear algebra'},
            'juan luis': {'real analysis'},
            'freddie': {'linear algebra'}}
but we can not check if element does exist!
We always have a result
student_courses['foobar']              # set(), always
'foobar' in student_courses            # False

Use Dictionary as a Hash

Dict type implemented as Hash table
english = ['one', 'two', 'three']
spanish = ['uno', 'dos', 'tres']

print(english.index('two'))            # 1
print(spanish[english.index('three')]) # 'tres' => slow

from bisect import bisect_left

lookup = [en for en, es in sorted(zip(english, spanish))]
value  = [es for en, es in sorted(zip(english, spanish))]

key = 'one'
print(value[bisect_left(lookup, key)]) # 'uno'

Performance testing

from numpy import array, linspace
from numpy.random import choice, shuffle
from string import ascii_lowercase
from bisect import bisect_left

MAX_SIZE = 10**4

letters = array([c for c in ascii_lowercase])
words   = choice(letters, size=(MAX_SIZE, 10))
words   = array([''.join(w) for w in words])

ns = linspace(1, MAX_SIZE, 500)
list_times = []
dict_times = []
bisect_times = []

for n in ns:
    shuffle(words)

    sample_list = list(words[:int(n)])
    sample_dict = dict.fromkeys(sample_list)
    sample_bisect = sorted(sample_list)

    lookup_words = choice(sample_list, size=10)

    # List indexing
    list_time   = %timeit -q -n1 -o [sample_list.index(w) for w in lookup_words]
    # Dict  mechanizm
    dict_time   = %timeit -q -n1 -o [sample_dict[w] for w in lookup_words]
    # Bisect mechanizm
    bisect_time = %timeit -q -n1 -o [bisect_left(sample_bisect, w) for w in lookup_words]

    list_times.append(list_time.best)
    dict_times.append(dict_time.best)
    bisect_times.append(bisect_time.best)


%matplotlib inline

from matplotlib.pyplot import figure, plot, title, legend, ylabel, xlabel, show

fig = figure(figsize=(12,8))

# plot(ns, list_times, 'bo',   label=u'list lookup ~ O(n)')       # so much slower
plot(ns, dict_times, 'go',   label=u'dict lookup ~ O(1)')         # the best
plot(ns, bisect_times, 'ro', label=u'bisect lookup ~ O(log(n))')

title('list vs dict vs bisect (binary-search) lookup')
legend(loc='best')
xlabel('input size (elements)', figure=fig)
ylabel('best time out of 3 (s)', figure=fig)
show()

dict vs object

class Foo:                                            bar = {}
    pass
foo = Foo()
foo.x = 10                                            bar['y'] = 10
print(foo.x)             # 10                         print(bar['y'])     # 10
                                                    from operator import getitem
print(getattr(foo, 'x')) # "getattr protocol" => 10   print(getitem(bar, 'y'))     # "getitem protocol" => 10
foo.__dict__
{'x': 10}

class Base:
    z = 10
class Derived(Base):
    pass
d = Derived()
print(d.__dict__)        # {}
print(getattr(d, 'z'))   # 10

Performance: object vs dictionary

class Foo:
    x = 10
foo = Foo()
foo.z = 10

bar = {'y': 20}

%timeit foo.z
%timeit bar['y']

# almost the same time
# 55.3 ns +/- 1.33 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
# 47.7 ns +/- 0.124 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)

Performance: __getitem__ vs __getattr__ vs __getattribute__

class Foo:
    def __getitem__(self, key):
        if key == 'x':
            return 10
        raise KeyError('no such key {}'.format(key))
    def __getattr__(self, attr):
        if attr == 'x':
            return 100
        raise AttributeError('no such attr {}'.format(attr))
foo = Foo()

%timeit foo.x
%timeit foo['x']

# __getitem__ is faster then __getattr__
# 515 ns +/- 9.13 ns per loop (mean +/- std. dev. of 7 runs, 1000000 loops each)
# 157 ns +/- 0.307 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)

# but if we use __getattribute__ instead of __getattr__:

class Foo:
    def __getitem__(self, key):
        if key == 'x':
            return 10
        raise KeyError('no such key {}'.format(key))

    def __getattribute__(self, attr):
        if attr == 'x':
            return 100
        raise AttributeError('no such attr {}'.format(attr))

foo = Foo()

%timeit foo.x
%timeit foo['x']

# performance is the same
# 144 ns +/- 2.58 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
# 158 ns +/- 0.0948 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)

Semantics: objects vs dict

function == dictionaty
__call__ == __getitem__ protocol
__getitem__ == __getattr__ protocol

Distinctions

? not use __missing__ ? to avoid complexity
Other approaches:
class Foo:
    def __getitem__(self, key):
        return key
    def __getattr__(self, attr):
        return attr

foo = Foo()

# __getitem__
print(foo['two words'], foo['123'], foo['abc/def'])   # ('two words', '123', 'abc/def')

# __getattr__ protocol
# foo.two words  # Error - attr has space
# foo.123        # Error - attr started with numbers
# foo.abc/def    # Error - attr with spec chars

=> restriction on name of the attribute in objects

Examples: obj vs dict

1. simple class

class Foo:
    def __init__(self):
        self.x = 10
foo = Foo()
print(foo.x, foo.__dict__['x'])      # 10 10 - 2 ways: .x and dict lookup

2. lookup

class Base:
    x = 10
class Derived(Base):
    pass
d = Derived()
Base.y = 100
print(d.y)                           # 100

# do the same (hierarchical lookup) in dict:

from collections import ChainMap
b = {}
d = ChainMap({}, b)                   # chains empty dict with b
b['x'] = 10
print(d['x'])                         # 10
b['x'] = 100
print(d['x'])                         # 100
d['x'] = 500
print(d['x'])                         # 500 - first dict in list with element

3. multiple derived class

class BaseA:
    x = 10
class BaseB:
    x = 100
    y = 200
class Derived(BaseA, BaseB):
    pass
d = Derived()
print(d.x, d.y)                       # (10, 200)

# the same in dict:

ba = {'x': 10}
bb = {'x': 100, 'y': 200}
d = ChainMap({}, ba, bb)
print(d['x'], d['y'])                 # (10, 200)

4. we can costruct ChainMap from __mro__ and get the same name resolution

print(Derived.__mro__)                # (__main__.Derived, __main__.BaseA, __main__.BaseB, object)

d = ChainMap({}, *(entry.__dict__ for entry in Derived.__mro__))
print(d['x'], d['y'])                 # (10, 200)

5. with instance state self.x = x

class Foo:
    def __init__(self, x):
        self.x = x
foo = Foo(10)
print(foo.x)                          # 10

6.

class Foo:
    def __init__(self, x):
        self._x = x

    @property
    def x(self):
        self._x += 1
        return self._x

foo = Foo(10)
print(foo.x, foo.x, foo.x)             # 11 12 13

# same with dictionary (we can not hide the behaviour in dict):

class Foo:
    def __init__(self, x):
        self._x = x

    def __getitem__(self, key):
        if key == 'x':
            self._x += 1
            return self._x
        raise KeyError('no such key {}'.format(key))

foo = Foo(10)
print(foo['x'], foo['x'], foo['x'])     # 11 12 13

attribute dictionary

dict with attribute lookup to get contents
attribute lookup and item lookup in one object
class attrdict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self               # assign dict to itself

quux = attrdict()

quux['w'] = 100                            # item syntax
print(quux.w)                              # 100

quux.w = 200                               # attribute syntax
print(quux['w'])                           # 200

# ATTN!!! Be careful with next statement on Windows:

from itertools import count
try:
    for num in count(1):
        attrdict({i: i for i in range(1024*1024)})
except MemoryError:
    print('Out of memory after creating {} attrdicts'.format(num))

# and perform GC immediatelly

from gc import collect; collect()             # 5205

# better formulation (without memory circle)

class attrdict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

try:
    for num in range(1, 150 + 1):
        attrdict({i: i for i in range(1024*1024)})
except MemoryError:
    print('Out of memory after creating {} attrdicts'.format(num))

print('Survived creating {} attrdicts'.format(num + 1))

Resume:

It's better do not combine attribute lookup and item lookup in one object!!!